In [9]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer
In [2]:
train_texts = pd.read_csv('./fake_news/train.csv').dropna(subset='text').text
test_texts = pd.read_csv('./fake_news/test.csv').dropna(subset='text').text
In [3]:
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

# Retrieve stopwords from all of the available languages into one set
stop_words = set(sum([stopwords.words(language) for language in stopwords.fileids()], []))

def clean(text):
    words = word_tokenize(text)
    words = [stemmer.stem(w.lower()) for w in words if  # Stem and lower
            not w in stop_words  # Not a stop word
            and w.isalpha()]  # Only contains letters
    
    return " ".join(words)
In [4]:
from tqdm import tqdm
tqdm.pandas()

train_texts = train_texts.progress_apply(clean)
test_texts = test_texts.progress_apply(clean)
100%|████████████████████████████████████| 20761/20761 [02:16<00:00, 152.42it/s]
100%|██████████████████████████████████████| 5193/5193 [00:34<00:00, 150.75it/s]
In [29]:
train_vectorizer = TfidfVectorizer(min_df=0.005, max_df=0.6)
X_train = train_vectorizer.fit_transform(train_texts)

X_test = train_vectorizer.transform(test_texts)
In [30]:
X_train.shape, X_test.shape
Out[30]:
((20761, 5551), (5193, 5551))
In [32]:
len_train = X_train.shape[0]
len_test = X_test.shape[0]

combined = np.concatenate([X_train.A, X_test.A])

combined_embedded = TSNE(n_components=2, perplexity=10, init='pca').fit_transform(combined)
train_embedded = combined_embedded[:len_train]
test_embedded = combined_embedded[len_train:]

assert train_embedded.shape[0] == len_train and test_embedded.shape[0] == len_test
/Users/anvil/Documents/Projects/Metaverse_mind_lab_tha/venv/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:805: FutureWarning: The default learning rate in TSNE will change from 200.0 to 'auto' in 1.2.
  warnings.warn(
/Users/anvil/Documents/Projects/Metaverse_mind_lab_tha/venv/lib/python3.8/site-packages/sklearn/manifold/_t_sne.py:991: FutureWarning: The PCA initialization in TSNE will change to have the standard deviation of PC1 equal to 1e-4 in 1.2. This will ensure better convergence.
  warnings.warn(
In [37]:
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default='notebook'

fig = go.Figure()
fig.add_trace(go.Scatter(x=train_embedded[:, 0], y=train_embedded[:, 1], mode='markers'))
fig.add_trace(go.Scatter(x=test_embedded[:, 0], y=test_embedded[:, 1], mode='markers'))
In [ ]: